{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 首先准备好模块，然后自动打开浏览器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from lxml.html import fromstring\n",
    "import time\n",
    "from random import random\n",
    "import requests_html\n",
    "from requests_html import HTMLSession\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 登陆打开浏览器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 185,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-185-d1c89fc3f1a1>:10: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
     ]
    }
   ],
   "source": [
    "wd = webdriver.Chrome()\n",
    "wd.get(\"https://www.baidu.com\")\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')\n",
    "opts.add_argument('window-size=1920x3000') #指定浏览器分辨率\n",
    "opts.add_argument('--disable-gpu') #谷歌文档提到需要加上一这个属性来规避bug\n",
    "opts.add_argument('--hide-scrollbars') \n",
    "\n",
    "\n",
    "driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 打开知网首页面\n",
    "- 由于我在学校用的是校园网，直接是学校登陆使用，所以不需要个人登陆"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(\"https://kns.cnki.net/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//*[@id=\"highSearch\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 188,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['CDwindow-ADB27244A79FB6CF6C32E68102163AF8', 'CDwindow-FC167709C69298BA47A9F0E526A3CED6']\n"
     ]
    }
   ],
   "source": [
    "print (driver.window_handles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-189-6c6d5ce6602d>:1: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('/html/body/div[3]/div[1]/div/ul[1]/li[1]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 进入检索的页面，进行专业检索（开始挖取第一页的文章）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 专业检索\n",
    "element = driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/ul/li[4]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [],
   "source": [
    "#勾选期刊类型\n",
    "element = driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[2]/input')\n",
    "element.click()#SCI来源期刊\n",
    "element = driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[4]/input')\n",
    "element.click()#北大核心\n",
    "element = driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[5]/input')\n",
    "element.click()#CSSCI\n",
    "element = driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[6]/input')\n",
    "element.click()#CSCD\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {},
   "outputs": [],
   "source": [
    "AI_dashuju_query = '( SU=\"大数据\" AND KY=\"人工智能\")'\n",
    "element = driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/textarea')\n",
    "element.clear()\n",
    "element = driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/textarea').send_keys(AI_dashuju_query)#输入关键词\n",
    "#点击检索\n",
    "element = driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[2]/input')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 194,
   "metadata": {},
   "outputs": [],
   "source": [
    "#点击50篇全选进行下载\n",
    "element = driver.find_element_by_xpath('/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/div[2]/div/div/div/i')\n",
    "element.click()\n",
    "element = driver.find_element_by_xpath('/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/div[2]/div/div/ul/li[3]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>绿色理念下智慧零售创新发展路径</td>\n",
       "      <td>翟璇</td>\n",
       "      <td>商业经济研究</td>\n",
       "      <td>2021-06-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>智能化金融监管：模型框架、边缘约束和实践策略  网络首发</td>\n",
       "      <td>乔宇锋</td>\n",
       "      <td>南方金融</td>\n",
       "      <td>2021-06-29 17:04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>105.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>“智能+”时代的现代文化产业体系：挑战与重塑</td>\n",
       "      <td>解学芳; 雷文宣</td>\n",
       "      <td>深圳大学学报(人文社会科学版)</td>\n",
       "      <td>2021-06-29 15:45</td>\n",
       "      <td>NaN</td>\n",
       "      <td>65.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>城市暴雨内涝综述：特征、机理、数据与方法</td>\n",
       "      <td>黄华兵; 王先伟; 柳林</td>\n",
       "      <td>地理科学进展</td>\n",
       "      <td>2021-06-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>管道数字孪生体模型的构建及应用</td>\n",
       "      <td>陈斯迅;李在蓉;王禹钦;关红亮;高丽</td>\n",
       "      <td>油气储运</td>\n",
       "      <td>2021-06-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>220.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>大数据智能时代地图学课程内容改革探索</td>\n",
       "      <td>刘慧敏; 邓敏; 刘宝举; 陈杰</td>\n",
       "      <td>测绘通报</td>\n",
       "      <td>2021-06-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>24.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>关于城市大脑未来形态的思考  网络首发</td>\n",
       "      <td>胡坚波</td>\n",
       "      <td>人民论坛·学术前沿</td>\n",
       "      <td>2021-06-18 11:09</td>\n",
       "      <td>NaN</td>\n",
       "      <td>385.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>人工智能审计应用的国际进展</td>\n",
       "      <td>吴勇; 余洁; 王尚纯; 张超</td>\n",
       "      <td>中国注册会计师</td>\n",
       "      <td>2021-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>236.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>人工智能审计应用的国际进展</td>\n",
       "      <td>吴勇; 余洁; 王尚纯; 张超</td>\n",
       "      <td>中国注册会计师</td>\n",
       "      <td>2021-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>236.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>地理空间认知理论与地图工具的发展</td>\n",
       "      <td>郑束蕾</td>\n",
       "      <td>测绘学报</td>\n",
       "      <td>2021-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>科学数据智能：人工智能在科学发现中的机遇与挑战  网络首发</td>\n",
       "      <td>孟小峰</td>\n",
       "      <td>中国科学基金</td>\n",
       "      <td>2021-06-11 18:01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>测绘大数据时代数据处理理论面临的挑战与发展  网络首发</td>\n",
       "      <td>朱建军; 宋迎春; 胡俊; 邹滨; 吴立新</td>\n",
       "      <td>武汉大学学报(信息科学版)</td>\n",
       "      <td>2021-06-09 10:26</td>\n",
       "      <td>NaN</td>\n",
       "      <td>281.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>人工智能助推教师队伍建设途径与方法研究</td>\n",
       "      <td>钟绍春; 钟卓; 张琢</td>\n",
       "      <td>中国电化教育</td>\n",
       "      <td>2021-06-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>431.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>人工智能助推教师队伍建设途径与方法研究</td>\n",
       "      <td>钟绍春; 钟卓; 张琢</td>\n",
       "      <td>中国电化教育</td>\n",
       "      <td>2021-06-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>431.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>数字化行政方式的权力正当性检视</td>\n",
       "      <td>展鹏贺</td>\n",
       "      <td>中国法学</td>\n",
       "      <td>2021-06-07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>475.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>数字化行政方式的权力正当性检视</td>\n",
       "      <td>展鹏贺</td>\n",
       "      <td>中国法学</td>\n",
       "      <td>2021-06-07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>475.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>人工智能在医疗领域的应用与思考</td>\n",
       "      <td>王笛; 赵靖; 金明超; 刘婧; 熊伟</td>\n",
       "      <td>中国医院管理</td>\n",
       "      <td>2021-06-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>192.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>人工智能钻井技术研究方法及其实践  网络首发</td>\n",
       "      <td>杨传书; 李昌盛; 孙旭东; 黄历铭; 张好林</td>\n",
       "      <td>石油钻探技术</td>\n",
       "      <td>2021-06-04 10:36</td>\n",
       "      <td>NaN</td>\n",
       "      <td>265.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>城市大脑：城市管理创新的智慧工具  网络首发</td>\n",
       "      <td>陆军</td>\n",
       "      <td>人民论坛·学术前沿</td>\n",
       "      <td>2021-06-03 15:38</td>\n",
       "      <td>NaN</td>\n",
       "      <td>562.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>健康中国战略下保险科技赋能商业健康保险发展研究</td>\n",
       "      <td>胡芳; 彭琛; 陈小红</td>\n",
       "      <td>西南金融</td>\n",
       "      <td>2021-06-01 16:20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>488.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                             篇名                       作者  \\\n",
       "0            1                绿色理念下智慧零售创新发展路径                       翟璇   \n",
       "1            2   智能化金融监管：模型框架、边缘约束和实践策略  网络首发                      乔宇锋   \n",
       "2            3         “智能+”时代的现代文化产业体系：挑战与重塑                 解学芳; 雷文宣   \n",
       "3            4           城市暴雨内涝综述：特征、机理、数据与方法             黄华兵; 王先伟; 柳林   \n",
       "4            5                管道数字孪生体模型的构建及应用       陈斯迅;李在蓉;王禹钦;关红亮;高丽   \n",
       "5            6             大数据智能时代地图学课程内容改革探索         刘慧敏; 邓敏; 刘宝举; 陈杰   \n",
       "6            7            关于城市大脑未来形态的思考  网络首发                      胡坚波   \n",
       "7            8                  人工智能审计应用的国际进展          吴勇; 余洁; 王尚纯; 张超   \n",
       "8            9                  人工智能审计应用的国际进展          吴勇; 余洁; 王尚纯; 张超   \n",
       "9           10               地理空间认知理论与地图工具的发展                      郑束蕾   \n",
       "10          11  科学数据智能：人工智能在科学发现中的机遇与挑战  网络首发                      孟小峰   \n",
       "11          12    测绘大数据时代数据处理理论面临的挑战与发展  网络首发    朱建军; 宋迎春; 胡俊; 邹滨; 吴立新   \n",
       "12          13            人工智能助推教师队伍建设途径与方法研究              钟绍春; 钟卓; 张琢   \n",
       "13          14            人工智能助推教师队伍建设途径与方法研究              钟绍春; 钟卓; 张琢   \n",
       "14          15                数字化行政方式的权力正当性检视                      展鹏贺   \n",
       "15          16                数字化行政方式的权力正当性检视                      展鹏贺   \n",
       "16          17                人工智能在医疗领域的应用与思考      王笛; 赵靖; 金明超; 刘婧; 熊伟   \n",
       "17          18         人工智能钻井技术研究方法及其实践  网络首发  杨传书; 李昌盛; 孙旭东; 黄历铭; 张好林   \n",
       "18          19         城市大脑：城市管理创新的智慧工具  网络首发                       陆军   \n",
       "19          20        健康中国战略下保险科技赋能商业健康保险发展研究              胡芳; 彭琛; 陈小红   \n",
       "\n",
       "                 刊名              发表时间  被引     下载  操作  \n",
       "0            商业经济研究        2021-06-30 NaN    6.0  下载  \n",
       "1              南方金融  2021-06-29 17:04 NaN  105.0  下载  \n",
       "2   深圳大学学报(人文社会科学版)  2021-06-29 15:45 NaN   65.0  下载  \n",
       "3            地理科学进展        2021-06-28 NaN    NaN  下载  \n",
       "4              油气储运        2021-06-25 NaN  220.0  下载  \n",
       "5              测绘通报        2021-06-25 NaN   24.0  下载  \n",
       "6         人民论坛·学术前沿  2021-06-18 11:09 NaN  385.0  下载  \n",
       "7           中国注册会计师        2021-06-15 NaN  236.0  下载  \n",
       "8           中国注册会计师        2021-06-15 NaN  236.0  下载  \n",
       "9              测绘学报        2021-06-15 NaN   11.0  下载  \n",
       "10           中国科学基金  2021-06-11 18:01 NaN    NaN  下载  \n",
       "11    武汉大学学报(信息科学版)  2021-06-09 10:26 NaN  281.0  下载  \n",
       "12           中国电化教育        2021-06-08 NaN  431.0  下载  \n",
       "13           中国电化教育        2021-06-08 NaN  431.0  下载  \n",
       "14             中国法学        2021-06-07 NaN  475.0  下载  \n",
       "15             中国法学        2021-06-07 NaN  475.0  下载  \n",
       "16           中国医院管理        2021-06-05 NaN  192.0  下载  \n",
       "17           石油钻探技术  2021-06-04 10:36 NaN  265.0  下载  \n",
       "18        人民论坛·学术前沿  2021-06-03 15:38 NaN  562.0  下载  \n",
       "19             西南金融  2021-06-01 16:20 NaN  488.0  下载  "
      ]
     },
     "execution_count": 195,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看的页面信息\n",
    "element = driver.find_element_by_id('gridTable')\n",
    "page_html = element.get_attribute('innerHTML')\n",
    "page_html\n",
    "pd.read_html(page_html)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 因为我们需要制作可视化图标，所以我们需要导出数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导出endnote的50页文本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将循环时需要点击的元素做成列表\n",
    "piliang_list = []\n",
    "piliang_list.append('//*[@id=\"gridTable\"]/div[1]/div[2]/div[1]/a')\n",
    "piliang_list.append(\"/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/div[1]/label/input\")\n",
    "piliang_list.append(\"/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/i\")\n",
    "piliang_list.append(\"/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/ul/li[1]/a\")\n",
    "piliang_list.append(\"/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/ul/li[1]/ul/li[9]/a\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "for page in range(0,26):# 因为最后一页没有下一页的按键，所以在第一行代码的页面循环中只需填写到最后一页的上一页页数即可，如此次爬取的全部为27页，则填写（0,26），循环到第26页的最后一个下一页按键即可。\n",
    "    for i in piliang_list:\n",
    "        element = driver.find_element_by_xpath(i)\n",
    "        element.click()\n",
    "        time.sleep(2)\n",
    "    driver.switch_to.window(driver.window_handles[2])#定位新窗口\n",
    "    time.sleep(10)\n",
    "    driver.find_element_by_xpath('//*[@id=\"litotxt\"]/a').click()#点击导出\n",
    "    time.sleep(10)\n",
    "    driver.close()#关闭新窗口\n",
    "    time.sleep(4)\n",
    "    driver.switch_to.window(driver.window_handles[1])#定位回原窗口\n",
    "    driver.find_element_by_xpath('//*[@id=\"PageNext\"]').click()#点击下一页\n",
    "    time.sleep(12)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### refworks分每页50篇各自循环导出"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 因为导出有问题在可视化上展示不到，所以现在不展示这一部分的代码"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## PDF下载"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 判断是否需要验证码，如果不需要的直接往下运行，需要就连接图鉴连接到本地进行识别"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 196,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 封装判断验证码\n",
    "def img_have():\n",
    "    try:\n",
    "        driver.find_element_by_xpath('//*[@id=\"vImg\"]')#验证码图片\n",
    "        return True#为真，即有验证码图片存在\n",
    "    except:\n",
    "        return False#反之为空则没有验证码，返回False"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 当验证码出现时，截图存到本地让图鉴识别出来"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "metadata": {},
   "outputs": [],
   "source": [
    "def shuru():\n",
    "    yanzhengma = base64_api(uname, pwd, img, typeid)\n",
    "    driver.find_element_by_xpath('//*[@id=\"vcode\"]').clear()\n",
    "    element = driver.find_element_by_xpath('//*[@id=\"vcode\"]').send_keys(yanzhengma)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 当遇到验证码时，用图鉴识别"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 封装图鉴API\n",
    "uname = \"Cnomeshgh\"\n",
    "pwd = \"*123Cnomeshgh\"\n",
    "typeid  = \"3\"\n",
    "def base64_api(uname, pwd, img, typeid):\n",
    "    with open('D:\\2020年网新作业\\2021网新\\web数据挖掘\\验证图片\\screenImg.png', 'rb') as f:    \n",
    "        base64_data = base64.b64encode(f.read())\n",
    "        img = base64_data.decode()\n",
    "    data = {\"username\": uname, \"password\": pwd, \"typeid\": typeid, \"image\": img}\n",
    "    time.sleep(1)\n",
    "    result = json.loads(requests.post(\"http://api.ttshitu.com/predict\", json=data).text)\n",
    "    if result['success']:\n",
    "        return result[\"data\"][\"result\"]\n",
    "    else:\n",
    "        return result[\"message\"]\n",
    "    return \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 199,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 如果出现验证码，直接通过图鉴的识别填上去\n",
    "def shuru():\n",
    "    yanzhengma = base64_api(uname, pwd, img, typeid)\n",
    "    driver.find_element_by_xpath('//*[@id=\"vcode\"]').clear()\n",
    "    element = driver.find_element_by_xpath('//*[@id=\"vcode\"]').send_keys(yanzhengma)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "metadata": {},
   "outputs": [],
   "source": [
    "def wenjian2():\n",
    "    path = r'D:\\2020年网新作业\\2021网新\\web数据挖掘\\pdf'#（需在selenium浏览器更改下载文件存储的地址）\n",
    "    file2 = int(len([lists for lists in os.listdir(path) if os.path.isfile(os.path.join(path, lists))]))#通过len和int将验证码输入后的文件夹里的文件数量进行检测，获取数量为输入验证码后文件夹里的文件量，需调用os模块\n",
    "    return file2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {},
   "outputs": [],
   "source": [
    "#返回列表页 报错的时候重新运行时方便用\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 202,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-202-7b10395f91a5>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      4\u001b[0m         \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_xpath\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpdf_xpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclick\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;31m#点击文章进入文章详情页\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m         \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m         \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mswitch_to\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwindow\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwindow_handles\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;31m#定位新窗口\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      7\u001b[0m         \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_xpath\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'//*[@id=\"pdfDown\"]'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclick\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;31m#点击下载\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      8\u001b[0m         \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mwindow_handles\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    722\u001b[0m         \"\"\"\n\u001b[0;32m    723\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mw3c\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 724\u001b[1;33m             \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mCommand\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mW3C_GET_WINDOW_HANDLES\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'value'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    725\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    726\u001b[0m             \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mCommand\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mGET_WINDOW_HANDLES\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'value'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m    317\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    318\u001b[0m         \u001b[0mparams\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_wrap_value\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 319\u001b[1;33m         \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommand_executor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdriver_command\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    320\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    321\u001b[0m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror_handler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcheck_response\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, command, params)\u001b[0m\n\u001b[0;32m    372\u001b[0m         \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mutils\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdump_json\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    373\u001b[0m         \u001b[0murl\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'%s%s'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_url\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 374\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_request\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand_info\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    375\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    376\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_request\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\selenium\\webdriver\\remote\\remote_connection.py\u001b[0m in \u001b[0;36m_request\u001b[1;34m(self, method, url, body)\u001b[0m\n\u001b[0;32m    395\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    396\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkeep_alive\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 397\u001b[1;33m             \u001b[0mresp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_conn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    398\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    399\u001b[0m             \u001b[0mstatuscode\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mresp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstatus\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\request.py\u001b[0m in \u001b[0;36mrequest\u001b[1;34m(self, method, url, fields, headers, **urlopen_kw)\u001b[0m\n\u001b[0;32m     73\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     74\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mmethod\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_encode_url_methods\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 75\u001b[1;33m             return self.request_encode_url(\n\u001b[0m\u001b[0;32m     76\u001b[0m                 \u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0murlopen_kw\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     77\u001b[0m             )\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\request.py\u001b[0m in \u001b[0;36mrequest_encode_url\u001b[1;34m(self, method, url, fields, headers, **urlopen_kw)\u001b[0m\n\u001b[0;32m     95\u001b[0m             \u001b[0murl\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;34m\"?\"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0murlencode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfields\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     96\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 97\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mextra_kw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     98\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     99\u001b[0m     def request_encode_body(\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\poolmanager.py\u001b[0m in \u001b[0;36murlopen\u001b[1;34m(self, method, url, redirect, **kw)\u001b[0m\n\u001b[0;32m    334\u001b[0m             \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    335\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 336\u001b[1;33m             \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mu\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest_uri\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    337\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    338\u001b[0m         \u001b[0mredirect_location\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mredirect\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_redirect_location\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[0;32m    668\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    669\u001b[0m             \u001b[1;31m# Make the request on the httplib connection object.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 670\u001b[1;33m             httplib_response = self._make_request(\n\u001b[0m\u001b[0;32m    671\u001b[0m                 \u001b[0mconn\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    672\u001b[0m                 \u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m    424\u001b[0m                     \u001b[1;31m# Python 3 (including for exceptions like SystemExit).\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    425\u001b[0m                     \u001b[1;31m# Otherwise it looks like a bug in the code.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 426\u001b[1;33m                     \u001b[0msix\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mraise_from\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    427\u001b[0m         \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mSocketTimeout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mBaseSSLError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mSocketError\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    428\u001b[0m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_raise_timeout\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0merr\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtimeout_value\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mread_timeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\packages\\six.py\u001b[0m in \u001b[0;36mraise_from\u001b[1;34m(value, from_value)\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m    419\u001b[0m                 \u001b[1;31m# Python 3\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    420\u001b[0m                 \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 421\u001b[1;33m                     \u001b[0mhttplib_response\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    422\u001b[0m                 \u001b[1;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    423\u001b[0m                     \u001b[1;31m# Remove the TypeError from the exception chain in\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\http\\client.py\u001b[0m in \u001b[0;36mgetresponse\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m   1330\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1331\u001b[0m             \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1332\u001b[1;33m                 \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbegin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1333\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1334\u001b[0m                 \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\http\\client.py\u001b[0m in \u001b[0;36mbegin\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    301\u001b[0m         \u001b[1;31m# read until we get a non-100 response\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    302\u001b[0m         \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 303\u001b[1;33m             \u001b[0mversion\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreason\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_read_status\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    304\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mstatus\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0mCONTINUE\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    305\u001b[0m                 \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\http\\client.py\u001b[0m in \u001b[0;36m_read_status\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    262\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    263\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_read_status\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 264\u001b[1;33m         \u001b[0mline\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_MAXLINE\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"iso-8859-1\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    265\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mline\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m>\u001b[0m \u001b[0m_MAXLINE\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    266\u001b[0m             \u001b[1;32mraise\u001b[0m \u001b[0mLineTooLong\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"status line\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m    667\u001b[0m         \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    668\u001b[0m             \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 669\u001b[1;33m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    670\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    671\u001b[0m                 \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "for page in range(1,4): #先进行三页循环，可自行选择爬取页数\n",
    "    for link in range(20,50):#可选择从一页中的第几篇文章开始挖取到第几篇文章结束挖取\n",
    "        pdf_xpath = '//*[@id=\"gridTable\"]/table/tbody/tr[{}]/td[2]/a'.format(link)\n",
    "        driver.find_element_by_xpath(pdf_xpath).click()#点击文章进入文章详情页\n",
    "        time.sleep(1)\n",
    "        driver.switch_to.window(driver.window_handles[2])#定位新窗口\n",
    "        driver.find_element_by_xpath('//*[@id=\"pdfDown\"]').click()#点击下载\n",
    "        time.sleep(3)\n",
    "        qiehuan()#切换到最新页面\n",
    "        time.sleep(4)\n",
    "        img_have()#判断是否弹出验证码窗口\n",
    "        abc = img_have()\n",
    "        if abc is True:#如果有验证码窗口\n",
    "            img()#爬取验证码\n",
    "            time.sleep(2)\n",
    "            base64_api(uname, pwd, img, typeid)#调用图鉴api识别验证码内容\n",
    "            shuru()#填写识别出来的验证码内容\n",
    "            wenjian1()#检测下载文件夹内的文件数量\n",
    "            file = wenjian1()\n",
    "            time.sleep(1)\n",
    "            driver.find_element_by_xpath('/html/body/div/form/dl/dd/button').click()#点击验证码提交\n",
    "            wenjian2()#检测点击下载后文件夹内的文件数量\n",
    "            file2 = wenjian2()\n",
    "            time.sleep(1)\n",
    "            wenjian3()#判断下载文件夹内的文件数量是否变化\n",
    "            abc2 = wenjian3()\n",
    "            while abc2 == False:#如果数量没有增加，则返回False，判断等于False，则重新进行验证码爬取识别以及输入并再次检测\n",
    "                img()#爬取验证码\n",
    "                base64_api(uname, pwd, img, typeid)#调用图鉴api识别验证码内容\n",
    "                shuru()#填写识别出来的验证码内容\n",
    "                wenjian1()#检测下载文件夹内的文件数量\n",
    "                file = wenjian1()\n",
    "                driver.find_element_by_xpath('/html/body/div/form/dl/dd/button').click()#点击验证码提交\n",
    "                wenjian2()#检测点击下载后文件夹内的文件数量\n",
    "                file2 = wenjian2()\n",
    "                wenjian3()#判断下载文件夹内的文件数量是否变化\n",
    "                abc2 = wenjian3()\n",
    "                if abc2 == True:#如果判断等于True，则代表输入正确下载成功\n",
    "                    break\n",
    "            true()#下载成功，情况正确，关闭窗口，返回原窗口，进行下一篇文章PDF原文文件下载   \n",
    "        else:#没有验证码窗口，则直接下载原文文件，然后关闭详情页，返回文章集合页，继续下一篇PDF原文文件下载\n",
    "            qiehuan()\n",
    "            driver.close()\n",
    "            driver.switch_to.window(driver.window_handles[1])\n",
    "            time.sleep(1)\n",
    "    driver.find_element"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
